fetch OBIS data using iobis/robis
# Region of interest, given as WKT. Despite the MULTIPOLYGON tag this is
# just one big rectangle; any WKT string can be substituted here.
roi <- "MULTIPOLYGON (((-175 -80, 165 -80, 165 75, -175 75, -175 -80)))"
# NOTE(review): the provenance notes below talk about special use areas that
# were manually transcribed from images and converted from min/degrees to
# decimal degrees. They do not obviously describe the rectangle above --
# confirm whether they still apply to this document.
#   images:    https://floridakeys.noaa.gov/zones/special/welcome.html
#   converter: https://www.pgc.umn.edu/apps/convert/
#   area page: https://floridakeys.noaa.gov/zones/special/easternsambo.html

# Species to query. The name is passed through sanitize_query_id() before
# being embedded in the identifier used as the cache key.
SPECIES_NAME <- "Balaenoptera musculus"
species_name_safe <- sanitize_query_id(SPECIES_NAME)
roi_query_id <- glue("query_id_placeholder_{species_name_safe}")

# Query robis only on a cache miss and store the result; otherwise reuse
# whatever was cached under the same id.
if (!has_cache(roi_query_id)) {
  occurrences <- robis::occurrence(SPECIES_NAME, geometry = roi)
  save_cache(roi_query_id, occurrences)
} else {
  occurrences <- load_cache(roi_query_id)
}
explore occurrences in R
## Warning: Removed 106 rows containing non-finite values (stat_count).

library(ggplot2)
# Histogram of occurrence records in 5-unit-wide bins of `year`, with the
# bars filled by source dataset using the ColorBrewer "Paired" palette.
ggplot(occurrences, aes(x = year, fill = datasetID)) +
  geom_histogram(binwidth = 5) +
  scale_fill_brewer(palette = 'Paired')
## Warning: Removed 10618 rows containing non-finite values (stat_bin).

# Quick-look interactive map of the occurrence records, drawn with the
# leaflet helper that ships with robis (default styling).
robis::map_leaflet(occurrences)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.3 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks glue::collapse()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(leaflet)
# Styling values for the hand-built map. `color` is kept for reference but is
# not used by the map below, which colours markers by dataset instead.
color <- "#ff3399"
provider_tiles <- "OpenStreetMap.BlackAndWhite"

# Popup text is the record id. `[[` works on a single named row vector as
# well as on the whole data frame, so the column can be pulled out in one go
# instead of apply()-ing over rows (which coerces the entire table to a
# character matrix first).
popup <- function(x) {
  x[["id"]]
}
p <- as.character(popup(occurrences))

# categorical colour palette keyed on the source dataset
factpal <- colorFactor(topo.colors(5), occurrences$datasetID)

leaflet(occurrences) %>%
  addProviderTiles(provider_tiles) %>%
  addCircleMarkers(
    lng = ~decimalLongitude,
    lat = ~decimalLatitude,
    popup = p,
    radius = 3,
    weight = 1,
    # the formula is evaluated against the map's data, so the bare column
    # name is enough
    color = ~factpal(datasetID),
    opacity = 1,
    fillOpacity = 0.1
  ) %>%
  # legend built from the same palette/values used for the markers
  addLegend(
    position = "bottomright",
    pal = factpal,
    values = ~datasetID,
    title = "datasetID",
    opacity = 1
  )
explore occurrences in Python
from pprint import pprint
# Inspect the R `occurrences` object as seen from Python through `r.`:
# first its type (echoed below as a pandas DataFrame) ...
type(r.occurrences)
# ... optionally the first few records (disabled) ...
#r.occurrences.head()
## <class 'pandas.core.frame.DataFrame'>
# ... and finally the names of all available columns.
pprint(r.occurrences.columns)
## Index([u'X', u'date_year', u'scientificNameID', u'scientificName',
## u'superfamilyid', u'individualCount', u'dropped',
## u'associatedReferences', u'aphiaID', u'decimalLatitude',
## ...
## u'parentEventID', u'samplingEffort', u'references', u'endDayOfYear',
## u'dataGeneralizations', u'georeferenceProtocol', u'locationID',
## u'georeferenceSources', u'lifeStage', u'depth'],
## dtype='object', length=130)
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame , read_csv
# Columns of the occurrence table that are summarised as categorical data;
# each one gets its own bar chart of most frequent values.
SELECTED_CATAGORICAL_COLUMNS = [
    'country', 'language', 'dropped', 'occurrenceStatus', 'modified',
    'organismQuantity', 'sampleSizeUnit', 'dataset_id', 'license',
    'organismQuantityType', 'scientificNameAuthorship', 'absence',
    'vernacularName', 'specificEpithet', 'datasetID', 'category',
    'sampleSizeValue', 'node_id', 'bibliographicCitation', 'continent',
    'identificationReferences', 'fieldNumber',
    'coordinateUncertaintyInMeters', 'taxonRemarks', 'behavior',
    'informationWithheld', 'samplingEffort', 'dataGeneralizations'
]
TOP_N = 3  # show just the top N

# One subplot row per column, 4 inches of height each. squeeze=False makes
# subplots() always return an array (even for a single column); ravel()
# then flattens it so axes[i] indexing works for any list length.
fig, axes = plt.subplots(
    nrows=len(SELECTED_CATAGORICAL_COLUMNS),
    ncols=1,
    figsize=(8, len(SELECTED_CATAGORICAL_COLUMNS) * 4),
    squeeze=False
)
axes = axes.ravel()

for plot_i, column in enumerate(SELECTED_CATAGORICAL_COLUMNS):
    print("plotting top {} values for column '{}'".format(TOP_N, column))
    axis = axes[plot_i]
    # value_counts() sorts by descending frequency, so the first TOP_N
    # entries are the most common values.
    counts = r.occurrences[column].value_counts()
    #print(list(counts.index)[:TOP_N])
    #print(counts.values[:TOP_N])
    # x/y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, while keywords work on old and new versions alike.
    sns.barplot(
        x=list(counts.index)[:TOP_N],
        y=counts.values[:TOP_N],
        alpha=0.8,
        ax=axis
    )
    axis.set_title('Top {} "{}" for {}'.format(TOP_N, column, r.SPECIES_NAME))
    #axis.set_ylabel('# Occurrences', fontsize=12)
    #axis.set_xlabel(column, fontsize=12)

# Report the columns that are not covered by any of the "used" column lists.
# More lists can be appended to the loop as further groups get handled.
unused_cols = list(r.occurrences.columns)
for used_list in [SELECTED_CATAGORICAL_COLUMNS]:
    used = set(used_list)  # set membership instead of scanning the list
    unused_cols = [x for x in unused_cols if x not in used]
# A single formatted string prints the same under Python 2 and 3;
# print("...", value) shows up as a tuple when run under Python 2.
print("unhandled columns: {}".format(unused_cols))
## ('unhandled columns: ', ['X', 'date_year', 'scientificNameID', 'scientificName', 'superfamilyid', 'individualCount', 'associatedReferences', 'aphiaID', 'decimalLatitude', 'subclassid', 'type', 'phylumid', 'familyid', 'catalogNumber', 'basisOfRecord', 'superclass', 'maximumDepthInMeters', 'id', 'order', 'recordNumber', 'georeferencedDate', 'superclassid', 'infraorderid', 'verbatimEventDate', 'decimalLongitude', 'date_end', 'collectionCode', 'speciesid', 'occurrenceID', 'superfamily', 'suborderid', 'date_start', 'organismID', 'dateIdentified', 'genus', 'ownerInstitutionCode', 'eventDate', 'taxonRank', 'genusid', 'originalScientificName', 'marine', 'minimumDepthInMeters', 'subphylumid', 'institutionCode', 'date_mid', 'eventTime', 'identificationRemarks', 'class', 'infraorder', 'suborder', 'nomenclaturalCode', 'orderid', 'footprintWKT', 'datasetName', 'taxonomicStatus', 'geodeticDatum', 'kingdom', 'classid', 'phylum', 'species', 'coordinatePrecision', 'subclass', 'subphylum', 'occurrenceRemarks', 'family', 'kingdomid', 'terrestrial', 'infraspecificEpithet', 'subspecies', 'brackish', 'waterBody', 'subspeciesid', 'rightsHolder', 'institutionID', 'year', 'day', 'month', 'eventID', 'locality', 'samplingProtocol', 'taxonConceptID', 'recordedBy', 'higherGeography', 'startDayOfYear', 'stateProvince', 'sex', 'georeferenceRemarks', 'dynamicProperties', 'identifiedBy', 'eventRemarks', 'otherCatalogNumbers', 'collectionID', 'typeStatus', 'footprintSRS', 'parentEventID', 'references', 'endDayOfYear', 'georeferenceProtocol', 'locationID', 'georeferenceSources', 'lifeStage', 'depth'])